---
title: "Replication code for The institutional impacts of algorithmic distribution: Facebook and the Australian news media"
author: "Francesco Bailo, James Meese, Eddy Hurcombe"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  pdf_document:
    keep_tex: true
    includes:
      in_header: "preamble.tex"
---

# Libraries

```{r load-packages, message=FALSE, warning=F}

# Packages attached by this document.
required_packages <-
  c("tidyverse", "readxl", "zoo", "parallel", "kableExtra")

# Find the packages that cannot be loaded. requireNamespace() checks a single
# package without building the full installed.packages() matrix.
is_installed <-
  vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)

new_packages <-
  required_packages[!is_installed]

if (length(new_packages) > 0) install.packages(new_packages)

# library() stops with an error when a package is still unavailable, whereas
# require() would only return FALSE and let later chunks fail obscurely.
# invisible() keeps the lapply() result out of the rendered report.
invisible(lapply(required_packages, library, character.only = TRUE))

# Record the R and package versions used for this run.
sessionInfo()

```

# Setup

```{r setup}

# Defaults applied to the chunks that follow: PDF graphics device, 8 x 7 inch
# figures at 300 dpi, and no messages or warnings in the rendered output.
knitr::opts_chunk$set(
  dev = "pdf",
  fig.width = 8,
  fig.height = 7,
  dpi = 300,
  message = FALSE,
  warning = FALSE
)

# Black-and-white ggplot2 theme for every figure.
ggplot2::theme_set(theme_bw())

```

# Traffic data

```{r fig-traffic-data-all, fig.cap = "Figure 1: Traffic share (by source type, mean)", fig.width = 8, fig.height = 7}

# ---- Load and clean the traffic data ----

dat <-
  read.csv("data-semrush.csv")

dat$date <-
  as.Date(dat$date)

# Keep only rows that have a date.
dat <-
  dat[!is.na(dat$date), ]

# Relabel the t.co referring domain as twitter.com.
dat$domain[dat$domain == "t.co"] <-
  "twitter.com"

# Lookup from the `publication` code in the traffic data to the identifier
# matched below against the Facebook page URLs in Categoryforanalysis.xlsx.
# NOTE(review): the string "NA" presumably marks publications without a
# matching page; they end up with a missing category and are filtered out.
labelling_list <- c(abc = "abcnews.au",
                    adelaideadvertiser = "theadvertiser",
                    afr = "financialreview",
                    brisbanetimes = "brisbanetimes",
                    buzzfeed = "BuzzFeedOz",
                    canberratimes = "CanberraTimes",
                    couriermail = "couriermail",
                    crikey = "crikey.com.au",
                    dailytele = "dailytelegraph",
                    diewelt = "NA",
                    heraldsun = "heraldsun",
                    huffingtonpostau = "HuffPostAustralia",
                    junkee = "junkeedotcom",
                    mamamia = "mamamia",
                    newmatilda = "newmatildadotcom",
                    newscomau = "news.com.au",
                    nine = "Ninecomau", #?
                    ntnews = "TheNTNews",
                    ped = "pedestriandottv",
                    sbs = "sbsnews",
                    smh = "sydneymorningherald",
                    svd = "NA",
                    telegraphuk = "NA",
                    theaustralian = "theaustralian",
                    theguardian = "theguardianaustralia",
                    wapo = "NA",
                    watoday = "WAtoday",
                    `the new daily` = "TheNewDaily", # Digital
                    `the west` = "thewestaustralian", #  Print
                    theage = "theageAustralia"
)

dat$publication_join <-
  labelling_list[match(dat$publication,
                       names(labelling_list))]

Categoryforanalysis <-
  read_excel("Categoryforanalysis.xlsx")

# Attach the category of each publication. The spreadsheet stores full
# Facebook URLs, so the URL prefix and any trailing slash are stripped
# before matching.
dat$category <-
  Categoryforanalysis$Category[match(
    dat$publication_join,
    gsub("https://www.facebook.com/|/$", "", Categoryforanalysis$`Site/Publication`))]

# Split the "search" source type into Google and all other search engines.
dat$source_type[dat$search_engine == "Google" & dat$source_type == "search"] <-
  "Google search"
dat$source_type[dat$search_engine != "Google" & dat$source_type == "search"] <-
  "other search"

# ---- Figure 1 data ----

# Month labels for October 2017 to March 2018, in chronological order.
# They are generated with the same format() call that labels the data, so
# labels and levels always agree, whatever the LC_TIME locale of the session
# (hard-coded English abbreviations would turn every date into NA elsewhere).
month_levels <-
  format(seq(as.Date("2017-10-01"), by = "month", length.out = 6), "%b %Y")

# Sum the traffic share per publication, month and source type, then
# average across the publications of each category.
fig1_dat <-
  dat %>%
  dplyr::filter(date < as.Date("2020-01-01") & !is.na(category)) %>%
  dplyr::mutate(date = factor(format(date, "%b %Y"), levels = month_levels)) %>%
  dplyr::group_by(publication, category, date, source_type) %>%
  dplyr::summarise(traffic_share = sum(traffic_share, na.rm = TRUE)) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(category, date, source_type) %>%
  dplyr::summarise(traffic_share = mean(traffic_share, na.rm = TRUE))

fig1_dat %>%
  ggplot(aes(x = date, y = traffic_share, fill = source_type)) +
  scale_fill_brewer(palette = "Set3") +
  # `labels` spelled out in full instead of relying on partial matching.
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  geom_bar(stat = "identity") +
  facet_wrap(category ~ ., scales = "free_y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  labs(x = NULL, y = "traffic share", fill = "source type")

```

```{r tab-traffic-data-all}

# Export the data behind Figure 1. write.csv() does not create missing
# directories, so make sure the output folder exists first.
dir.create("out", showWarnings = FALSE)
write.csv(fig1_dat, file = "out/fig1_dat.csv")

```


```{r fig-traffic-data-social-fb, fig.cap = "Figure 7: Traffic share (social and Facebook.com only, mean)",  fig.width = 8, fig.height = 3.5}

# Month labels for October 2017 to March 2018, in chronological order.
# They are generated with the same format() call that labels the data, so
# labels and levels always agree, whatever the LC_TIME locale of the session.
month_levels <-
  format(seq(as.Date("2017-10-01"), by = "month", length.out = 6), "%b %Y")

# Traffic referred by facebook.com only: summed per publication and month,
# then averaged across the publications of each category.
fig7_dat <-
  dat %>%
  dplyr::filter(date < as.Date("2020-01-01") & !is.na(category)) %>%
  dplyr::mutate(date = factor(format(date, "%b %Y"), levels = month_levels)) %>%
  dplyr::filter(domain == "facebook.com") %>%
  dplyr::group_by(category, publication, date) %>%
  dplyr::summarise(traffic_share = sum(traffic_share, na.rm = TRUE)) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(category, date) %>%
  dplyr::summarise(traffic_share = mean(traffic_share, na.rm = TRUE))

fig7_dat %>%
  ggplot(aes(x = date, y = traffic_share)) +
  geom_bar(stat = "identity") +
  # `labels` spelled out in full instead of relying on partial matching.
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  facet_wrap(category ~ ., scales = "free_y", ncol = 4) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  labs(x = NULL, y = "traffic share")

```

```{r tab-traffic-data-social-fb}

# Export the data behind Figure 7. write.csv() does not create missing
# directories, so make sure the output folder exists first.
dir.create("out", showWarnings = FALSE)
write.csv(fig7_dat, file = "out/fig7_dat.csv")

```

```{r fig-traffic-data-fb-selected, fig.cap = "Figure 8: Traffic share (social and Facebook.com only, sum)", fig.width = 9, fig.height = 3.5}

# Month labels for October 2017 to March 2018, in chronological order.
# They are generated with the same format() call that labels the data, so
# labels and levels always agree, whatever the LC_TIME locale of the session.
month_levels <-
  format(seq(as.Date("2017-10-01"), by = "month", length.out = 6), "%b %Y")

# Monthly facebook.com traffic share for five selected publications.
fig8_dat <-
  dat %>%
  # Drop column `X` so that rows identical in every other column collapse.
  # NOTE(review): `X` is presumably the row-number column of the CSV - confirm.
  dplyr::select(-X) %>%
  dplyr::distinct() %>%
  dplyr::filter(domain == "facebook.com" &
                  publication %in% c("abc", "ped", "buzzfeed", "heraldsun", "theguardian")) %>%
  dplyr::filter(date < as.Date("2020-01-01") & !is.na(category)) %>%
  dplyr::mutate(
    date = factor(format(date, "%b %Y"), levels = month_levels),
    # Display names for the facet strips. "abc" and "buzzfeed" keep their
    # codes; the `sbs` entry has no effect because "sbs" is not selected above.
    publication = dplyr::recode(publication,
                                sbs = "SBS News",
                                ped = "Pedestrian",
                                heraldsun = "Herald Sun",
                                theguardian = "Guardian")
  ) %>%
  dplyr::group_by(publication, date) %>%
  dplyr::summarise(traffic_share = sum(traffic_share, na.rm = TRUE)) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(publication) %>%
  # Optional indexing on the first month, currently disabled:
  # dplyr::mutate(traffic_share = traffic_share / traffic_share[1]) %>%
  dplyr::arrange(publication, date)

fig8_dat %>%
  ggplot(aes(x = date, y = traffic_share)) +
  # geom_hline(yintercept = 1) +
  geom_bar(stat = "identity") +
  # `labels` spelled out in full instead of relying on partial matching.
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  facet_wrap(publication ~ ., scales = "free_y", nrow = 1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  labs(x = NULL, y = "traffic share")

```

```{r tab-traffic-data-fb-selected}

# Export the data behind Figure 8. write.csv() does not create missing
# directories, so make sure the output folder exists first.
dir.create("out", showWarnings = FALSE)
write.csv(fig8_dat, file = "out/fig8_dat.csv")

```

# CrowdTangle data

```{r load-ct-data}

# Restore the saved objects from both .RData files into the current
# environment. Any existing object with the same name is overwritten.
# NOTE(review): later chunks use `dat` with CrowdTangle columns (Page.Name,
# posix, Type, ...), so one of these files presumably replaces the traffic
# `dat` loaded earlier - confirm.
load(file = "australian_media_list.df.RData")
load(file = "ct_au_media_dat.RData")

```


Data timeframe: `r as.Date(min(dat$posix))` - `r as.Date(max(dat$posix))`

## Functions

```{r performance-score-functions}

# Mean of the values lying strictly between the first and third quartile
# of `x`.
#
# x: numeric vector; missing values are ignored.
#
# Returns a single number. The result is NaN when no value falls strictly
# between the two quartiles (for instance when all values are equal).
benchmarking <- function(x) {
  # Remove missing values up front: quantile() stops with an error on NA
  # input, which made the na.rm in the final mean() unreachable.
  x <- x[!is.na(x)]
  quartiles <- quantile(x, probs = c(.25, .75), names = FALSE)
  mean(x[x > quartiles[1] & x < quartiles[2]])
}

# Performance score on interactions for the window centred on this_seq[i].
#
# Reads `dat_wt_benchmark`, `this_seq` and `benchmarking` from the calling
# environment (they are exported to the workers before the parallel run).
#
# i: index into `this_seq`.
#
# Returns one row per Page.Name / page_type present in the window, holding
# benchmarking(Interactions) divided by the first benchmark.score of the
# group, plus a `day` column set to this_seq[i].
computePerfScoreInteractions <- function(i) {
  centre_day <- this_seq[i]
  # Rows from 14 days before up to, but excluding, 15 days after the centre.
  window_start <- centre_day - 14
  window_end <- centre_day + 15

  in_window <- dplyr::filter(
    dat_wt_benchmark,
    day >= window_start & day < window_end
  )
  by_page <- dplyr::group_by(in_window, Page.Name, page_type)
  scores <- dplyr::summarize(
    by_page,
    Performance.Score = benchmarking(Interactions) / benchmark.score[1]
  )
  scores$day <- centre_day

  scores
}

# Performance score on comments for the window centred on this_seq[i].
#
# Reads `dat_wt_benchmark`, `this_seq` and `benchmarking` from the calling
# environment (they are exported to the workers before the parallel run).
#
# i: index into `this_seq`.
#
# Returns one row per Page.Name / page_type present in the window, holding
# benchmarking(Comments) divided by the first benchmark.score of the group,
# plus a `day` column set to this_seq[i].
computePerfScoreComments <- function(i) {
  centre_day <- this_seq[i]
  # Rows from 14 days before up to, but excluding, 15 days after the centre.
  window_start <- centre_day - 14
  window_end <- centre_day + 15

  in_window <- dplyr::filter(
    dat_wt_benchmark,
    day >= window_start & day < window_end
  )
  by_page <- dplyr::group_by(in_window, Page.Name, page_type)
  scores <- dplyr::summarize(
    by_page,
    Performance.Score = benchmarking(Comments) / benchmark.score[1]
  )
  scores$day <- centre_day

  scores
}

# Performance score on shares for the window centred on this_seq[i].
#
# Reads `dat_wt_benchmark`, `this_seq` and `benchmarking` from the calling
# environment (they are exported to the workers before the parallel run).
#
# i: index into `this_seq`.
#
# Returns one row per Page.Name / page_type present in the window, holding
# benchmarking(Shares) divided by the first benchmark.score of the group,
# plus a `day` column set to this_seq[i].
computePerfScoreShares <- function(i) {
  centre_day <- this_seq[i]
  # Rows from 14 days before up to, but excluding, 15 days after the centre.
  window_start <- centre_day - 14
  window_end <- centre_day + 15

  in_window <- dplyr::filter(
    dat_wt_benchmark,
    day >= window_start & day < window_end
  )
  by_page <- dplyr::group_by(in_window, Page.Name, page_type)
  scores <- dplyr::summarize(
    by_page,
    Performance.Score = benchmarking(Shares) / benchmark.score[1]
  )
  scores$day <- centre_day

  scores
}

```

## Time frame of analysis

```{r analysis-time-frame}

# First and last day of the period used to compute the benchmark scores.
benchmarking_timeframe <- as.Date(c("2017-01-01", "2017-04-01"))

# First and last day for which performance scores are computed.
date_limits <- as.Date(c("2014-01-01", "2020-12-15"))

```

# Posting frequency

```{r fig-posting-freq, fig.cap = "Figure 9: Number of postings (7-day moving average)", fig.width = 8, fig.height = 3.5}

# Daily number of postings of four pages, smoothed with a centred 7-point
# moving average.
dat %>%
  # Derive the calendar day first and filter on it. Comparing a POSIXct
  # column directly with Date bounds is not a date comparison (R compares
  # the underlying numbers and only warns); as.Date() leaves a column that
  # is already a Date unchanged.
  dplyr::mutate(date = as.Date(posix)) %>%
  dplyr::filter(Page.Name %in% c("Daily Mail Australia",
                                 "ABC News", "Mamamia",
                                 "Guardian Australia"),
                date >= as.Date("2017-09-01") &
                  date < as.Date("2018-12-01")) %>%
  dplyr::count(Page.Name, date) %>%
  dplyr::group_by(Page.Name) %>%
  dplyr::arrange(date) %>%
  # NOTE(review): days without any posting have no row, so the average runs
  # over 7 consecutive observed days - confirm every page posts daily.
  dplyr::mutate(n_ma7 = zoo::rollmean(n, 7, fill = NA)) %>%
  ggplot(aes(x = date, y = n_ma7)) +
  geom_line() +
  facet_wrap(Page.Name ~ ., ncol = 4) +
  labs(x = NULL, y = "postings") +
  # The data window is wider than the plotted range, which leaves room for
  # the moving average at both ends of the axis.
  scale_x_date(date_labels = "%b %Y",
               limits = as.Date(c("2017-10-01", "2018-10-31"))) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
  
```



# Weighting

```{r weighting}

# Daily weights of each page, based on how many Link posts it publishes.
#   n                 Link posts of the page on that day
#   n_ma30            centred 30-row moving average of n within the page
#   global_weight     n_ma30 as a share of the day's total over all pages
#   page_type_weight  n_ma30 as a share of the day's total within page_type
# Each group_by() replaces the previous grouping. The result is sorted by
# day and left grouped by Page.Name.
weighting_by_type_on_links.df <-
  dat %>%
  dplyr::filter(Type == "Link") %>%
  dplyr::group_by(Page.Name, page_type, day, .drop = FALSE) %>%
  dplyr::count() %>%
  dplyr::ungroup() %>%
  dplyr::arrange(day) %>%
  dplyr::group_by(Page.Name) %>%
  dplyr::mutate(n_ma30 = rollmean(n, k = 30, fill = NA, align = "center")) %>%
  dplyr::group_by(day) %>%
  dplyr::mutate(global_weight = n_ma30 / sum(n_ma30)) %>%
  dplyr::group_by(page_type, day) %>%
  dplyr::mutate(page_type_weight = n_ma30 / sum(n_ma30)) %>%
  dplyr::group_by(Page.Name) %>%
  dplyr::arrange(day)

```

```{r weighting-example}

# Weights on a single example day, shown as percentages rounded to two
# decimals and sorted by page type, then by decreasing global weight.
weighting_by_type_on_links.df %>%
  dplyr::filter(day == "2020-01-01") %>%
  dplyr::arrange(page_type, desc(global_weight)) %>%
  dplyr::mutate(
    global_weight = round(100 * global_weight, 2),
    page_type_weight = round(100 * page_type_weight, 2)
  ) %>%
  # write.csv(file = "table8.csv", row.names = FALSE) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = c("striped", "scale_down"))

```


## Interactions

```{r interactions, eval = F}

# Benchmark per page and post type: benchmarking() applied to the total
# number of reactions of the posts published in the benchmarking window.
# as.Date() makes the comparison with the Date bounds well defined whether
# `posix` is stored as POSIXct or as Date.
benchmark <-
  dat %>%
  dplyr::filter(as.Date(posix) >= benchmarking_timeframe[1] &
                  as.Date(posix) <= benchmarking_timeframe[2]) %>%
  dplyr::group_by(Page.Name, Type) %>%
  dplyr::summarise(benchmark.score =
                     benchmarking(Likes + Love + Wow + Haha + Sad + Angry + Care))

dat_wt_benchmark <-
  merge(dat, benchmark,
        by = c("Page.Name", "Type"))

# Total reactions per post.
dat_wt_benchmark$Interactions <-
  dat_wt_benchmark$Likes +
  dat_wt_benchmark$Love +
  dat_wt_benchmark$Wow +
  dat_wt_benchmark$Haha +
  dat_wt_benchmark$Sad +
  dat_wt_benchmark$Angry +
  dat_wt_benchmark$Care

# One performance score is computed for every day in this sequence.
this_seq <- seq(date_limits[1],
                date_limits[2], by = "day")

# Keep Link posts and only the columns the workers need.
dat_wt_benchmark <-
  dat_wt_benchmark %>%
  dplyr::filter(Type %in% c("Link")) %>%
  dplyr::mutate(day = as.Date(posix)) %>%
  dplyr::select(Page.Name, page_type, Interactions, benchmark.score, day)

# Up to 10 workers, capped at the number of cores detected on this machine.
n_workers <- max(1L, min(10L, parallel::detectCores(), na.rm = TRUE))
cl <- makeCluster(n_workers)

# The cluster is stopped even when a worker fails. invisible() hides the
# clusterEvalQ() result without redirecting the output of the whole session
# (the earlier sink("/dev/null") was never undone and is not portable).
par_res <- tryCatch(
  {
    invisible(clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    }))
    clusterExport(cl, c("benchmarking", "this_seq", "dat_wt_benchmark"))
    parLapply(cl, seq_along(this_seq), fun = computePerfScoreInteractions)
  },
  finally = stopCluster(cl)
)

interactions_ma30_dat <-
  dplyr::bind_rows(par_res)

interactions_dat_wt_benchmark <-
  dat_wt_benchmark

# Cached results; the figure and table chunks load these files.
save(interactions_dat_wt_benchmark, file = "interactions_dat_wt_benchmark.RData")
save(interactions_ma30_dat, file = "interactions_ma30_dat.RData")

```


```{r fig-interactions-perf-score, fig.cap = "Performance score on interactions"}

# Per-page performance scores saved by the `interactions` chunk.
load("interactions_ma30_dat.RData")

# One panel per page; the horizontal line at 1 marks the benchmark level.
interactions_ma30_dat %>%
  ggplot(aes(day, Performance.Score, colour = page_type)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  facet_wrap(~ Page.Name, scales = "free_y", ncol = 6) +
  labs(x = NULL, y = NULL, colour = "page type") +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "bottom"
  )

```

### Clustered (weighted average)

```{r interactions-clustered}

# Attach the daily weights of each page to its performance scores.
interactions_ma30_dat <-
  merge(
    x = interactions_ma30_dat,
    y = dplyr::select(weighting_by_type_on_links.df,
                      Page.Name, day, global_weight, page_type_weight),
    by = c("Page.Name", "day")
  )

# Weighted mean score per page type on seven reference days (31 December
# of 2014-2019 and 1 October 2020).
interactions_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = page_type_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(page_type, day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```


```{r fig-interactions-perf-score-clustered, fig.cap = "Figure 3: Performance score on interactions", fig.width=5, fig.height=4}

# Weighted mean performance score per page type and day, one panel per
# page type; the horizontal line at 1 marks the benchmark level.
interactions_ma30_dat %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(Performance.Score =
                     weighted.mean(Performance.Score,
                                   w = page_type_weight, na.rm = TRUE)) %>%
  ggplot(aes(x = day,
             y = Performance.Score,
             colour = page_type)) +
  geom_line() +
  geom_smooth() +
  facet_wrap(page_type ~ ., ncol = 1, scales = "free_y") +
  theme(strip.text.x = element_text(size = 8, margin = margin(0.05, 0, 0.05, 0, "cm")),
        legend.position = "bottom") +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL) +
  # The panels already name the page type, so the colour legend is hidden.
  # "none" replaces the deprecated `FALSE`.
  guides(colour = "none")

```


```{r, fig.cap = "Proportion of links above the early-2017 benchmark"}

# Post-level data saved by the `interactions` chunk.
load("interactions_dat_wt_benchmark.RData")

# Daily share of posts of each "Public Service Media" page whose
# interactions exceed the page's benchmark score, smoothed with a 30-row
# moving average. The horizontal line marks 50%; the red and blue vertical
# lines mark 8 June 2018 and 2 August 2018.
interactions_dat_wt_benchmark %>%
  dplyr::filter(page_type %in% c("Public Service Media")) %>%
  dplyr::group_by(Page.Name, day) %>%
  dplyr::summarise(
    perc.above.benchmark = sum(Interactions > benchmark.score) / n()
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(day) %>%
  dplyr::group_by(Page.Name) %>%
  dplyr::mutate(
    perc.above.benchmark.30ma = rollmean(perc.above.benchmark, k = 30, fill = NA)
  ) %>%
  ggplot(aes(day, perc.above.benchmark.30ma)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2018-06-08"), colour = "red") +
  geom_vline(xintercept = as.Date("2018-08-02"), colour = "blue") +
  geom_hline(yintercept = .5) +
  scale_y_continuous(label = function(x) paste0(x * 100, "%")) +
  facet_wrap(~ Page.Name, ncol = 1) +
  labs(x = NULL, y = NULL) +
  theme(legend.position = "bottom")
```


```{r eval = F}

# Export the postings published within 40 days of a reference date for
# manual inspection. The calendar day is derived with as.Date() before it is
# compared with the Date bounds: a POSIXct column compared directly with a
# Date is compared on its raw numbers, not as a date.
# Requires the WriteXLS package, which is not attached by this document.

# ABC News, around 8 June 2018.
tmp <-
  dat %>%
  dplyr::filter(Page.Name == "ABC News" &
                  as.Date(posix) >= as.Date("2018-06-08") - 40 &
                  as.Date(posix) <= as.Date("2018-06-08") + 40) %>%
  dplyr::mutate(Overperforming.Score = as.numeric(Overperforming.Score)) %>%
  dplyr::select(Page.Name, URL, Type, Message, Overperforming.Score, Total.Interactions)

WriteXLS::WriteXLS(tmp, ExcelFileName = "abcnews_postings.xls")

# SBS News, around 2 August 2018.
tmp <-
  dat %>%
  dplyr::filter(Page.Name == "SBS News" &
                  as.Date(posix) >= as.Date("2018-08-02") - 40 &
                  as.Date(posix) <= as.Date("2018-08-02") + 40) %>%
  dplyr::mutate(Overperforming.Score = as.numeric(Overperforming.Score)) %>%
  dplyr::select(Page.Name, URL, Type, Message, Overperforming.Score, Total.Interactions)

WriteXLS::WriteXLS(tmp, ExcelFileName = "sbsnews_postings.xls")

```



### Industry-wide (weighted average)

```{r interactions-industry-wide}

# Weighted mean score over all pages (global weights) on seven reference
# days (31 December of 2014-2019 and 1 October 2020).
interactions_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```

```{r fig-interactions-perf-score-industry-wide, fig.cap = "Figure 2: Performance score on interactions", fig.width = 5, fig.height = 3}

# Daily weighted mean score over all pages (global weights); the
# horizontal line at 1 marks the benchmark level.
interactions_ma30_dat %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  ggplot(aes(day, Performance.Score)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL) +
  theme(legend.position = "bottom")

```

## Comments 

```{r comments, eval = F}

# Benchmark per page and post type: benchmarking() applied to the number of
# comments of the posts published in the benchmarking window.
# as.Date() makes the comparison with the Date bounds well defined whether
# `posix` is stored as POSIXct or as Date.
benchmark <-
  dat %>%
  dplyr::filter(as.Date(posix) >= benchmarking_timeframe[1] &
                  as.Date(posix) <= benchmarking_timeframe[2]) %>%
  dplyr::group_by(Page.Name, Type) %>%
  dplyr::summarise(benchmark.score =
                     benchmarking(Comments))

dat_wt_benchmark <-
  merge(dat, benchmark,
        by = c("Page.Name", "Type"))

# One performance score is computed for every day in this sequence.
this_seq <- seq(date_limits[1],
                date_limits[2], by = "day")

# Keep Link posts and only the columns the workers need.
dat_wt_benchmark <-
  dat_wt_benchmark %>%
  dplyr::filter(Type %in% c("Link")) %>%
  dplyr::mutate(day = as.Date(posix)) %>%
  dplyr::select(Page.Name, page_type, Comments, benchmark.score, day)

# Up to 10 workers, capped at the number of cores detected on this machine.
n_workers <- max(1L, min(10L, parallel::detectCores(), na.rm = TRUE))
cl <- makeCluster(n_workers)

# The cluster is stopped even when a worker fails. invisible() hides the
# clusterEvalQ() result without redirecting the output of the whole session
# (the earlier sink("/dev/null") was never undone and is not portable).
par_res <- tryCatch(
  {
    invisible(clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    }))
    clusterExport(cl, c("benchmarking", "this_seq", "dat_wt_benchmark"))
    parLapply(cl, seq_along(this_seq), fun = computePerfScoreComments)
  },
  finally = stopCluster(cl)
)

comments_ma30_dat <-
  dplyr::bind_rows(par_res)

# Cached result; the figure and table chunks load this file.
save(comments_ma30_dat, file = "comments_ma30_dat.RData")

```

```{r fig-comments-perf-score, fig.cap = "Performance score on comments"}

# Per-page performance scores saved by the `comments` chunk.
load("comments_ma30_dat.RData")

# One panel per page; the horizontal line at 1 marks the benchmark level.
comments_ma30_dat %>%
  ggplot(aes(day, Performance.Score, colour = page_type)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  facet_wrap(~ Page.Name, scales = "free_y", ncol = 6) +
  labs(x = NULL, y = NULL) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "bottom"
  )

```

### Clustered (weighted average)

```{r comments-clustered}

# Attach the daily weights of each page to its performance scores.
comments_ma30_dat <-
  merge(
    x = comments_ma30_dat,
    y = dplyr::select(weighting_by_type_on_links.df,
                      Page.Name, day, global_weight, page_type_weight),
    by = c("Page.Name", "day")
  )

# Weighted mean score per page type on seven reference days (31 December
# of 2014-2019 and 1 October 2020).
comments_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = page_type_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(page_type, day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```

```{r fig-comments-perf-score-clustered, fig.cap = "Performance score on comments", fig.width=5, fig.height=4}

# Weighted mean performance score per page type and day, one panel per
# page type; the horizontal line at 1 marks the benchmark level.
comments_ma30_dat %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(Performance.Score =
                     weighted.mean(Performance.Score,
                                   w = page_type_weight, na.rm = TRUE)) %>%
  ggplot(aes(x = day,
             y = Performance.Score,
             colour = page_type)) +
  geom_line() +
  geom_smooth() +
  facet_wrap(page_type ~ ., ncol = 1, scales = "free_y") +
  theme(strip.text.x = element_text(size = 8, margin = margin(0.05, 0, 0.05, 0, "cm")),
        legend.position = "bottom") +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL, colour = "page type") +
  # The panels already name the page type, so the colour legend is hidden.
  # "none" replaces the deprecated `FALSE`.
  guides(colour = "none")

```

### Industry-wide (weighted average)

```{r comments-industry-wide}

# Weighted mean score over all pages (global weights) on seven reference
# days (31 December of 2014-2019 and 1 October 2020).
comments_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```

```{r fig-comments-perf-score-industry-wide, fig.cap = "Figure 4: Performance score on comments", fig.width = 5, fig.height = 3}

# Daily weighted mean score over all pages (global weights); the
# horizontal line at 1 marks the benchmark level.
comments_ma30_dat %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  ggplot(aes(day, Performance.Score)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL, colour = "page type") +
  theme(legend.position = "bottom")

```

## Shares 

```{r shares, eval = F}

# Benchmark per page and post type: benchmarking() applied to the number of
# shares of the posts published in the benchmarking window.
# as.Date() makes the comparison with the Date bounds well defined whether
# `posix` is stored as POSIXct or as Date.
benchmark <-
  dat %>%
  dplyr::filter(as.Date(posix) >= benchmarking_timeframe[1] &
                  as.Date(posix) <= benchmarking_timeframe[2]) %>%
  dplyr::group_by(Page.Name, Type) %>%
  dplyr::summarise(benchmark.score =
                     benchmarking(Shares))

dat_wt_benchmark <-
  merge(dat, benchmark,
        by = c("Page.Name", "Type"))

# One performance score is computed for every day in this sequence.
this_seq <- seq(date_limits[1],
                date_limits[2], by = "day")

# Keep Link posts and only the columns the workers need.
dat_wt_benchmark <-
  dat_wt_benchmark %>%
  dplyr::filter(Type %in% c("Link")) %>%
  dplyr::mutate(day = as.Date(posix)) %>%
  dplyr::select(Page.Name, page_type, Shares, benchmark.score, day)

# Up to 10 workers, capped at the number of cores detected on this machine.
n_workers <- max(1L, min(10L, parallel::detectCores(), na.rm = TRUE))
cl <- makeCluster(n_workers)

# The cluster is stopped even when a worker fails. invisible() hides the
# clusterEvalQ() result without redirecting the output of the whole session
# (the earlier sink("/dev/null") was never undone and is not portable).
par_res <- tryCatch(
  {
    invisible(clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    }))
    clusterExport(cl, c("benchmarking", "this_seq", "dat_wt_benchmark"))
    parLapply(cl, seq_along(this_seq), fun = computePerfScoreShares)
  },
  finally = stopCluster(cl)
)

shares_ma30_dat <-
  dplyr::bind_rows(par_res)

# Cached result; the figure and table chunks load this file.
save(shares_ma30_dat, file = "shares_ma30_dat.RData")

```


```{r fig-shares-perf-score, fig.cap = "Performance score on shares"}

# Per-page performance scores saved by the `shares` chunk.
load("shares_ma30_dat.RData")

# One panel per page; the horizontal line at 1 marks the benchmark level.
shares_ma30_dat %>%
  ggplot(aes(day, Performance.Score, colour = page_type)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  facet_wrap(~ Page.Name, scales = "free_y", ncol = 6) +
  labs(x = NULL, y = NULL, colour = "page type") +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "bottom"
  )

```


### Clustered (weighted average)

```{r shares-clustered}

# Attach the daily weights of each page to its performance scores.
shares_ma30_dat <-
  merge(
    x = shares_ma30_dat,
    y = dplyr::select(weighting_by_type_on_links.df,
                      Page.Name, day, global_weight, page_type_weight),
    by = c("Page.Name", "day")
  )

# Weighted mean score per page type on seven reference days (31 December
# of 2014-2019 and 1 October 2020).
shares_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = page_type_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(page_type, day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```

```{r fig-shares-perf-score-clustered, fig.cap = "Figure 6: Performance score on shares", fig.width=5, fig.height=4}

# Weighted mean performance score per page type and day, one panel per
# page type; the horizontal line at 1 marks the benchmark level.
shares_ma30_dat %>%
  dplyr::group_by(day, page_type) %>%
  dplyr::summarize(Performance.Score =
                     weighted.mean(Performance.Score,
                                   w = page_type_weight, na.rm = TRUE)) %>%
  ggplot(aes(x = day,
             y = Performance.Score,
             colour = page_type)) +
  geom_line() +
  geom_smooth() +
  facet_wrap(page_type ~ ., ncol = 1, scales = "free_y") +
  theme(strip.text.x = element_text(size = 8, margin = margin(0.05, 0, 0.05, 0, "cm")),
        legend.position = "bottom") +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL, colour = "page type") +
  # The panels already name the page type, so the colour legend is hidden.
  # "none" replaces the deprecated `FALSE`.
  guides(colour = "none")

```

### Industry-wide (weighted average)

```{r shares-industry-wide}

# Weighted mean score over all pages (global weights) on seven reference
# days (31 December of 2014-2019 and 1 October 2020).
shares_ma30_dat %>%
  dplyr::filter(
    day %in% as.Date(c(paste0(2014:2019, "-12-31"), "2020-10-01"))
  ) %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  dplyr::arrange(day) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```


```{r fig-shares-perf-score-industry-wide, fig.cap = "Figure 5: Performance score on shares", fig.width = 5, fig.height = 3}

# Daily weighted mean score over all pages (global weights); the
# horizontal line at 1 marks the benchmark level.
shares_ma30_dat %>%
  dplyr::group_by(day) %>%
  dplyr::summarize(
    Performance.Score = weighted.mean(Performance.Score,
                                      w = global_weight,
                                      na.rm = TRUE)
  ) %>%
  ggplot(aes(day, Performance.Score)) +
  geom_line() +
  geom_smooth() +
  geom_hline(yintercept = 1) +
  labs(x = NULL, y = NULL) +
  theme(legend.position = "bottom")

```

## Stats

```{r stats}

# Each table compares the weighted mean performance score on 15 November
# 2020 with the one on 15 November 2014; the unnamed third column is the
# ratio of the two.

# Interactions, all pages (global weights).
interactions_ma30_dat %>%
  dplyr::summarize(
    w.mean.2020 = weighted.mean(Performance.Score[day == "2020-11-15"],
                                w = global_weight[day == "2020-11-15"],
                                na.rm = TRUE),
    w.mean.2014 = weighted.mean(Performance.Score[day == "2014-11-15"],
                                w = global_weight[day == "2014-11-15"],
                                na.rm = TRUE)
  ) %>%
  dplyr::mutate(w.mean.2020 / w.mean.2014) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

# Comments, all pages (global weights).
comments_ma30_dat %>%
  dplyr::summarize(
    w.mean.2020 = weighted.mean(Performance.Score[day == "2020-11-15"],
                                w = global_weight[day == "2020-11-15"],
                                na.rm = TRUE),
    w.mean.2014 = weighted.mean(Performance.Score[day == "2014-11-15"],
                                w = global_weight[day == "2014-11-15"],
                                na.rm = TRUE)
  ) %>%
  dplyr::mutate(w.mean.2020 / w.mean.2014) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

# Shares, all pages (global weights).
shares_ma30_dat %>%
  dplyr::summarize(
    w.mean.2020 = weighted.mean(Performance.Score[day == "2020-11-15"],
                                w = global_weight[day == "2020-11-15"],
                                na.rm = TRUE),
    w.mean.2014 = weighted.mean(Performance.Score[day == "2014-11-15"],
                                w = global_weight[day == "2014-11-15"],
                                na.rm = TRUE)
  ) %>%
  dplyr::mutate(w.mean.2020 / w.mean.2014) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

# Shares, separately for each page type (page-type weights).
shares_ma30_dat %>%
  dplyr::group_by(page_type) %>%
  dplyr::summarize(
    w.mean.2020 = weighted.mean(Performance.Score[day == "2020-11-15"],
                                w = page_type_weight[day == "2020-11-15"],
                                na.rm = TRUE),
    w.mean.2014 = weighted.mean(Performance.Score[day == "2014-11-15"],
                                w = page_type_weight[day == "2014-11-15"],
                                na.rm = TRUE)
  ) %>%
  dplyr::mutate(w.mean.2020 / w.mean.2014) %>%
  kable(booktabs = TRUE, caption = "") %>%
  kable_styling(latex_options = "striped")

```


